#Dennis Moskov, Master Thesis
#MLR for prediction
#conversion, selectivity and yield
#Split by unique articles and CV

#use database with article number

#randomly shuffle the data
set.seed(77)                      # seed for reproducibility
DBs<-DB[sample(nrow(DB)),]

#initiate possible results
results<-rbind(c("Conversion","Selectivity","Yield"),c("X.MeOH","S.MeOH","Y.MeOH"),c(length(DBs)-2,length(DBs)-1,length(DBs)))


#loop through different outcomes
for (r in 1:3) {

#use desired outcome
useDB<-DBs[-c(as.numeric(results[3,-r]))]

#initiate results lists
res<-list()
res.names<-c("number","fold","predicted","observed","residuals","residuals_squared")
Anova<-list()
summ<-list()


#divide data by article number
ref.num.CV <- unique(DBs[,1])

#choose number of folds
k <- 5

#build folds with unique article numbers
for(j in 1:k){
  if(j<k){
    change.index <- ref.num.CV[trunc(length(ref.num.CV)/k*(j-1)+1):trunc(length(ref.num.CV)/k*j)]
  }else{
    change.index <- ref.num.CV[trunc(length(ref.num.CV)/k*(j-1)+1):(length(ref.num.CV))]
  }
  for(l in 1:length(change.index)){
    useDB[(DBs[,1]==change.index[l]),1] = j
  }
}

#initiate matrices for regression and prediction values
reg<-matrix(, nrow = 10,ncol = k)
colnames(reg, do.NULL = FALSE)
colnames(reg) <-  colnames(reg, do.NULL = FALSE, prefix = "fold ")
rownames(reg) <- c("Sample","RSS","TSS","MSS","R","R","adj.R","MSE","RMSE","SDEC")
predic<-matrix(,nrow=4,ncol=1)
colnames(predic) <-  "overall"
rownames(predic) <- c("PRESS","Q","PSE","SDEP")

#initiate matrix for p-values for importance analysis
pval<-vector("list", k)
pvaln<-vector("list", k)

#perform k fold cross validation
for(i in 1:k){
    #segement data by fold  
    testIndexes <- which(useDB[,1]==i,arr.ind=TRUE)
    testData <- useDB[testIndexes, ]               #test data fold k
    testData <- testData[-1]			#remove fold column
    trainData <- useDB[-testIndexes, ]             #training data fold k
    trainData <- trainData[-1]			#remove fold column
    
    #multiple linear regression and prediction
    form <- paste(names(trainData)[length(trainData)], "~", paste(names(trainData)[-length(trainData)], collapse=" + "))
    fit<-lm(form, data=trainData)

    #make sure test and train sets have same level range for prep.meth
    fit$xlevels[["prep.meth"]] <- union(fit$xlevels[["prep.meth"]], levels(testData$prep.meth))
    fit$xlevels[["hClusdendo"]] <- union(fit$xlevels[["hClusdendo"]], levels(testData$hClusdendo))  #for clustered data base hierarchical
    fit$xlevels[["clustering"]] <- union(fit$xlevels[["clustering"]], levels(testData$clustering))  #for clustered data base PAM
 

    #predict
    pred<-predict(object=fit, newdata=testData)

    #save results of each fold
    res$number<-c(res$number,as.numeric(names(pred)))
    res$fold<-c(res$fold,rep(i,length(pred)))
    res$predicted<-c(res$predicted,unname(pred))
    
    #save anova and summary results for value extraction and significance analysis
    Anova[[i]]<-anova(fit)
    summ[[i]]<-summary(fit)
    pvaln[[i]]<-rownames(Anova[[i]][5])
    pval[[i]]<-Anova[[i]][5][,1]

    #values for model fitness for each fold
    reg["Sample",i]<-length(summ[[i]][[3]])                 #number of datapoints in training set of the fold
    reg["RSS",i]<-Anova[[i]][[2]][length(Anova[[i]][[2]])]  #Residual Sum of Squares
    reg["TSS",i]<-sum(Anova[[i]][[2]])                      #Total Sum of Squares
    reg["MSS",i]<-reg["TSS",i]-reg["RSS",i]                 #Model Sum of Squares
    reg["R",i]<-summ[[i]][[8]]                             #coefficient of determination
    reg["R",i]<-sqrt(reg["R",i])                           #multiple correlation coefficient
    reg["adj.R",i]<-summ[[i]][[9]]                         #Adjusted coefficient of determination
    reg["RMSE",i]<-summ[[i]][[6]]                           #Root Mean square error (residual standard deviation)
    reg["MSE",i]<-(reg["RMSE",i])^2                         #Mean square error
    reg["SDEC",i]<-sqrt(reg["RSS",i]/reg["Sample",i])       #Standard Deviation Error in Calculation
}

#average values for model fitness
reg<-cbind(reg,rowSums(reg)/k)
colnames(reg)[length(colnames(reg))]<-"overall"

#calculate residuals
res$observed<-useDB[,length(useDB)]
res$residuals<-res$observed-res$predicted
res$residuals_squared<-(res$residuals)^2


#values for goodness of prediction                                     
predic["PRESS",1]<-sum(res$residuals_squared)                            #Predictive Error Sum of Squares
predic["Q",1]<-1-(predic["PRESS",1]/reg["TSS","overall"])               #Cross-validated predictive R2
predic["PSE",1]<-predic["PRESS",1]/length(res$residuals_squared)         #predictive squared error
predic["SDEP",1]<-sqrt(predic["PSE",1])                                  #standard deviation error in prediction


#matrix of used variables in each fold
max.len <- max(sapply(pval, length))
corrected.list <- lapply(pval, function(x) {c(x, rep(NA, max.len - length(x)))})
pval <- do.call(cbind, corrected.list)
colnames(pval, do.NULL = FALSE)
colnames(pval) <-  colnames(pval, do.NULL = FALSE, prefix = "fold ")

#matrix of used variable names in each fold
max.len <- max(sapply(pvaln, length))
corrected.list <- lapply(pvaln, function(x) {c(x, rep(NA, max.len - length(x)))})
pvaln <- do.call(cbind, corrected.list)
colnames(pvaln, do.NULL = FALSE)
colnames(pvaln) <-  colnames(pvaln, do.NULL = FALSE, prefix = "fold ")

pval<-cbind(pvaln[,1],pval[,1],pvaln[,2],pval[,2],pvaln[,3],pval[,3],pvaln[,4],pval[,4],pvaln[,5],pval[,5])

#--------------------------------PLOT------------------------------------------------------------------------------------
#plot predicted vs. observed
x11()
plot(res$predicted,res$observed,pch=16, col=res$fold, xlim=c(0,1),ylim=c(0,1),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Predicted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(1:k), lty ="longdash",title = "folds",col=unique(res$fold))


x11()
plot(res$predicted,res$observed,pch=16, col=res$fold,xlim=c(min(0,min(res$predicted)),max(max(res$observed),max(res$predicted))),ylim=c(min(0,min(res$predicted)),max(max(res$observed),max(res$predicted))),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Predicted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(1:k), lty ="longdash",title = "folds",col=unique(res$fold))


#plot predicted vs. residuals
x11() 
plot(res$predicted,res$residuals, col=res$fold, xlim=c(0,1),ylim=c(-1,1),xlab=paste("Predicted MeOH",results[1,r]), xaxs="i",yaxs="i",ylab="Residuals",main=paste("Predicted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)

x11() 
plot(res$predicted,res$residuals, col=res$fold, xlim=c(min(res$predicted),max(res$predicted)), ylim=c(min(res$residuals),max(res$residuals)),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab="Residuals",main=paste("Predicted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)


#plot residual density
x11()
plot(density(res$residuals),xlab="Residuals", ylab="Density",main=paste("Density Plot of Residuals for",results[1,r]))

#--------------------------------DISPLAY------------------------------------------------------

#display insignificant variables pvalue >0.05
#pval[(pval[,"overall"]>0.05),"overall"]
View(reg,paste("Regression Values for MeOH",results[1,r]))
View(predic,paste("Prediction Values for MeOH",results[1,r]))
View(pval,paste("p-values for importance analysis for MeOH",results[1,r]))


#---------------------------------SAVE------------------------------------------------------------------------------------
#plot predicted vs. observed
png(filename=paste(results[1,r]," predVSobs full.png"))
par(new=TRUE, pch=16)
plot(res$predicted,res$observed,pch=16, col=res$fold,xlim=c(0,1),ylim=c(0,1),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Predicted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(1:k), lty ="longdash",title = "folds",col=unique(res$fold))
dev.off()

png(filename=paste(results[1,r]," predVSobs cropped.png"))
par(new=TRUE, pch=16)
plot(res$predicted,res$observed,pch=16, col=res$fold,xlim=c(min(0,min(res$predicted)),max(max(res$observed),max(res$predicted))),ylim=c(min(0,min(res$predicted)),max(max(res$observed),max(res$predicted))),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Predicted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(1:k), lty ="longdash",title = "folds",col=unique(res$fold))
dev.off()


#plot predicted vs. residuals
png(filename=paste(results[1,r]," predVSres full.png"))
plot(res$predicted,res$residuals, col=res$fold,xlim=c(0,1),ylim=c(-1,1),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab="Residuals",main=paste("Predicted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)
dev.off()

png(filename=paste(results[1,r]," predVSres cropped.png"))
plot(res$predicted,res$residuals, col=res$fold,xlim=c(min(res$predicted),max(res$predicted)), ylim=c(min(res$residuals),max(res$residuals)),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab="Residuals",main=paste("Predicted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)
dev.off()

#plot residual density
png(filename=paste(results[1,r]," resDensity.png"))
plot(density(res$residuals),xlab="Residuals", ylab="Density",main=paste("Density Plot of Residuals for",results[1,r]))
dev.off()

#regression and prediction results
write.csv(reg, file =paste(results[1,r]," regression_values.csv"))
write.csv(predic, file =paste(results[1,r]," prediction_values.csv"))
write.csv(res, file =paste(results[1,r]," results.csv"), row.names=FALSE)
capture.output(summ, file =paste(results[1,r], " Summary.txt"))
capture.output(Anova, file =paste(results[1,r], " Anova.txt"))
write.csv(pval, file =paste(results[1,r]," p-values.csv"))

}


















